Introduction

Preperations

if (!file.exists("GEO_data_cleaned.rds")) {
    rmarkdown::render("A2_SijieXu.Rmd")
}

geo_data <- readRDS("GEO_data_cleaned.rds")

# Map sample matrix
samples <- data.frame(lapply(colnames(geo_data), FUN = function(x) {
    unlist(strsplit(x, split = "_"))
}))
colnames(samples) <- colnames(geo_data)
rownames(samples) <- c("cell_type", "sample_type", "patient")
samples <- data.frame(t(samples))

Data visualization

graphic abstraction Plitas et al. (2016) The image above shows an abstraction of the data used in our analysis. RNAseq data were sampled from the patient with breast cancer, including tumor, normal breast parenchyma (NBP) and blood samples. Our study only focuses on the released tumor and NBP samples from the Illumina HiSeq 2500 platform.

# Unpivot dataset
geo_data_normalized_unpivot <- geo_data %>% log2() %>% as.data.frame() %>% tidyr::gather(key = sample, 
    value = log_cpm) %>% subset(log_cpm > 0)

# Density plot before and after nor.
ggplot2::ggplot(geo_data_normalized_unpivot, ggplot2::aes(x = log_cpm, color = sample)) + 
    ggplot2::geom_density() + ggplot2::labs(title = "Density by sample after Normalization") + 
    ggplot2::theme(legend.position = "none")

# Boxplot before and after nor.
ggplot2::ggplot(geo_data_normalized_unpivot, ggplot2::aes(x = sample, y = log_cpm, 
    color = sample)) + ggplot2::geom_boxplot() + ggplot2::coord_flip() + ggplot2::labs(title = "Boxplot by sample after Normalization") + 
    ggplot2::theme(legend.position = "none")

The data were normalized with the TTM workflow. The first plot is a density plot after normalization; it shows an approximately bimodal distribution for our dataset. The second plot is a boxplot of the data by the sample after the normalization; it shows that the data is skewed left.

# Create heatmap from cleaned data
heatmap_matrix <- geo_data %>% as.matrix() %>% t() %>% scale() %>% t() %>% na.omit()
heatmap_col = circlize::colorRamp2(c(min(heatmap_matrix), 0, max(heatmap_matrix)), 
    c("blue", "white", "red"))

overall_heatmap <- ComplexHeatmap::Heatmap(as.matrix(heatmap_matrix), show_row_dend = TRUE, 
    show_column_dend = TRUE, col = heatmap_col, show_column_names = TRUE, show_row_names = FALSE, 
    show_heatmap_legend = TRUE)

# Plot the overall heatmap
overall_heatmap

The heatmap shows quite a few gene expression variations between different patients and sample types. Thus we should process with differential gene expression analysis.

# Label the MDS plots by patient
limma::plotMDS(geo_data, labels = samples$patient, col = unlist(rainbow(10))[factor(samples$patient)], 
    main = "MDS plot by patient")

# Label the MDS plots by samples type
limma::plotMDS(geo_data, labels = samples$sample_type, col = c("red", "dark green")[factor(samples$sample_type)], 
    main = "MDS plot by sample type")

The variation between each sample type and patient is examined using MDS plot; we can see that the clustering between the same patient is closer than the clustering of the sample type. Therefore, we should be aware of possible false-positive results that are introduced by the patient variation.

Differential gene expression

Model creation

# Setup the model attributesf for EdgeR
model_design <- model.matrix(~samples$sample_type)
d <- edgeR::DGEList(counts = geo_data, group = samples$sample_type)
d <- edgeR::estimateDisp(d, model_design)

# Fit QLF model and get result
fit <- edgeR::glmQLFit(d, model_design)
fit.qlf <- edgeR::glmQLFTest(fit, coef = "samples$sample_typetumor")

knitr::kable(edgeR::topTags(fit.qlf), type = "html", row.names = FALSE, caption = "Fitted with sample type by edgeR")
Fitted with sample type by edgeR
logFC logCPM F PValue FDR
-6.179812 4.965514 59.75670 0e+00 0.0000319
-5.728405 4.076935 58.40430 0e+00 0.0000319
-7.088433 3.524745 56.66723 0e+00 0.0000319
-4.141401 5.475291 56.08041 0e+00 0.0000319
-5.887068 4.639614 47.69596 0e+00 0.0001526
-5.832741 2.498825 44.56041 1e-07 0.0002602
-5.278487 4.644875 42.56181 1e-07 0.0003279
-5.293646 1.877521 42.36472 1e-07 0.0003279
3.585678 6.099079 41.09898 2e-07 0.0003957
3.747045 6.008733 40.54206 2e-07 0.0004081
x
BH
x
samples$sample_typetumor
x
glm
topfit_byS.qlf <- edgeR::topTags(fit.qlf, sort.by = "PValue", n = nrow(geo_data))

topfit_byS.qlf_sig <- length(which(topfit_byS.qlf$table$PValue < 0.01))
topfit_byS.qlf_sigadj <- length(which(topfit_byS.qlf$table$FDR < 0.05))

# ------------------------------------------------------------------------------------------------

# Design differential expression model according to sample type for limma
model_design <- model.matrix(~samples$sample_type)
minimalSet <- Biobase::ExpressionSet(heatmap_matrix)

fit <- limma::lmFit(minimalSet, model_design) %>% limma::eBayes(trend = TRUE)

topfit <- limma::topTable(fit, coef = ncol(model_design), adjust.method = "BH", number = nrow(geo_data))

topfit$hgnc_symbol <- rownames(topfit)

# Sort by pvalue
topfit_byS <- topfit[order(topfit$adj.P.Val), ]

# Showcase data
knitr::kable(topfit[1:10, ], type = "html", row.names = FALSE, caption = "Fitted with sample type by limma")
Fitted with sample type by limma
logFC AveExpr t P.Value adj.P.Val B hgnc_symbol
1.434683 0 3.973780 0.0000707 0.388953 0.7910178 ABCG1
-1.420668 0 -3.935681 0.0000830 0.388953 0.6739018 SSBP2
1.412642 0 3.912375 0.0000914 0.388953 0.6028155 PLPBP
-1.358095 0 -3.758380 0.0001710 0.388953 0.1437266 PER2
1.357057 0 3.758215 0.0001711 0.388953 0.1432447 ZMIZ2
-1.314630 0 -3.642877 0.0002696 0.388953 -0.1885169 PLD1
1.307934 0 3.623817 0.0002903 0.388953 -0.2423444 ZC3H3
-1.282341 0 -3.550517 0.0003845 0.388953 -0.4467255 TMEM164
1.277315 0 3.538598 0.0004023 0.388953 -0.4795648 CMTR1
1.277079 0 3.534289 0.0004089 0.388953 -0.4914086 METRNL
topfit_byS_sig <- length(which(topfit_byS$P.Value < 0.01))
topfit_byS_sigadj <- length(which(topfit_byS$adj.P.Val < 0.05))

# ------------------------------------------------------------------------------------------------

# Design differential expression model according to cell type + patient
model_design <- model.matrix(~samples$sample_type + samples$patient)
minimalSet <- Biobase::ExpressionSet(heatmap_matrix)

fit <- limma::lmFit(minimalSet, model_design) %>% limma::eBayes(trend = TRUE)

topfit <- limma::topTable(fit, coef = ncol(model_design), adjust.method = "BH", number = nrow(geo_data))

topfit$hgnc_symbol <- rownames(topfit)

# Sort by pvalue
topfit_bySP <- topfit[order(topfit$adj.P.Val), ]

# Showcase data
knitr::kable(topfit[1:10, ], type = "html", row.names = FALSE, caption = "Fitted with sample type & patient by limma")
Fitted with sample type & patient by limma
logFC AveExpr t P.Value adj.P.Val B hgnc_symbol
-4.280336 0 -4.677461 2.90e-06 0.0347214 4.069101 HMGB1P5
-4.204458 0 -4.596608 4.30e-06 0.0347214 3.733523 CHI3L2
-4.138095 0 -4.522847 6.10e-06 0.0347214 3.432483 IGHV5-51
-4.098641 0 -4.477752 7.50e-06 0.0347214 3.250837 IGHVIII-67-4
-4.086622 0 -4.462422 8.10e-06 0.0347214 3.189500 CXCL10
-4.039747 0 -4.411532 1.03e-05 0.0366475 2.987397 RNF17
-3.858813 0 -4.212958 2.52e-05 0.0764309 2.220949 PLSCR1
-3.824320 0 -4.180413 2.91e-05 0.0764309 2.098699 USP41
-3.756429 0 -4.105116 4.04e-05 0.0764309 1.819494 ISG20
-3.745168 0 -4.087864 4.35e-05 0.0764309 1.756237 IFIT1
topfit_bySP_sig <- length(which(topfit_bySP$P.Value < 0.01))
topfit_bySP_sigadj <- length(which(topfit_bySP$adj.P.Val < 0.05))

1. Calculate p-values for each of the genes in your expression set.

How many genes were significantly differentially expressed? What thresholds did you use and why?

Here we calculate p-values for each of the genes in the expression set. The thresholds of 1% are chosen because our data set is consists of> 20,000 genes. If 1% of these are included by random chance, there will be around 200 false positives, which is not ideal for the downstream threshold over-representation analysis. However, I choose to use 5% for the adjusted P-value since it is usually enough for RNA-seq analysis.

The simple model (model using sample along by limma) labels 534 genes with significant p values from the analysis above. In the meantime, there are 499 genes with significant p values in the complex model (using sample + patient by limma). Additionally, the simple model by edgeR produces 1373 genes with significant p values.

2. Multiple hypothesis testing - correct your p-values using multiple hypothesis correction methods.

Which method did you use? And Why? How many genes passed correction?

I choose to go with empirical Bayes moderation in the limma package and quasi-likelihood negative binomial generalized log-linear model for multiple hypothesis correction.

The simple model (model using sample along by limma) labels 0 genes with significant adjusted p values from the analysis above. In the meantime, there are 6 genes with significant adjusted p values in the complex model (using sample + patient by limma). Additionally, the simple model by edgeR produces 465 genes with significant adjusted p values.

Eventually, I decided to use the QLF method based on the observation below:

Model selection

Simple versus complex (Limma)

# Model comparison
simple_model_pvalues <- data.frame(hgnc_symbol = topfit_byS$hgnc_symbol, simple_pvalue = topfit_byS$P.Value)

pat_model_pvalues <- data.frame(hgnc_symbol = topfit_bySP$hgnc_symbol, patient_pvalue = topfit_bySP$P.Value)

two_models_pvalues <- merge(simple_model_pvalues, pat_model_pvalues, by.x = 1, by.y = 1)

# Plot across simple & complex model
two_models_pvalues$colour <- "black"
two_models_pvalues$colour[two_models_pvalues$simple_pvalue < 0.05] <- "orange"
two_models_pvalues$colour[two_models_pvalues$patient_pvalue < 0.05] <- "blue"
two_models_pvalues$colour[two_models_pvalues$simple_pvalue < 0.05 & two_models_pvalues$patient_pvalue < 
    0.05] <- "red"

plot(two_models_pvalues$simple_pvalue, two_models_pvalues$patient_pvalue, col = two_models_pvalues$colour, 
    xlab = "simple model p-values", ylab = "complex model p-values", main = "Simple vs Complex Limma")
legend("topleft", inset = 0.01, legend = c("Simple", "Complex", "Both", "Not Signif"), 
    fill = c("orange", "blue", "red", "black"))

# ------------------------------------------------------------------------------------------------

# Plot with genes highlighted on the study

two_models_pvalues$colour <- "grey"

plot(two_models_pvalues$simple_pvalue, two_models_pvalues$patient_pvalue, col = two_models_pvalues$colour, 
    xlab = "simple model p-values", ylab = "complex model p-values", main = "Simple vs Complex Limma with CCR")
points(two_models_pvalues[which(grepl("CCR", two_models_pvalues$hgnc_symbol)), 2:3], 
    pch = 20, col = "red", cex = 1.5)
legend("topleft", inset = 0.01, legend = c("CCR", "rest"), fill = c("red", "grey"))

The complex and simple limma P-value plot shows an apparent disagreement between the sample and the complex model. This aligns with our MDS plots shown in the previous steps. The complex model introduced another layer of information and altered the thresholds list to include lots of genes which is not significantly expressed by looking at their samples alone. Additionally, CCR genes, the highlighted genes stated in the research, are labelled in the second plot. It is more cluster under the simple model versus the complex model. Therefore, I choose to use the simple model to align with the factor of interest.

Limma vs EdgeR method

# Model comparison
limma_model_pvalues <- data.frame(hgnc_symbol = topfit_byS$hgnc_symbol, limma_pvalue = topfit_byS$P.Value)

edgeR_model_pvalues <- data.frame(hgnc_symbol = rownames(topfit_byS.qlf$table), edgeR_pvalue = topfit_byS.qlf$table$PValue)

two_models_pvalues <- merge(limma_model_pvalues, edgeR_model_pvalues, by.x = 1, by.y = 1)

# Plot across simple & complex model
two_models_pvalues$colour <- "black"
two_models_pvalues$colour[two_models_pvalues$limma_pvalue < 0.05] <- "orange"
two_models_pvalues$colour[two_models_pvalues$edgeR_pvalue < 0.05] <- "blue"
two_models_pvalues$colour[two_models_pvalues$limma_pvalue < 0.05 & two_models_pvalues$edgeR_pvalue < 
    0.05] <- "red"

plot(two_models_pvalues$limma_pvalue, two_models_pvalues$edgeR_pvalue, col = two_models_pvalues$colour, 
    xlab = "Limma model p-values", ylab = "EdgeR model p-values", main = "Limma vs EdgeR model by sample type")
legend("topleft", inset = 0.01, legend = c("Simple", "Complex", "Both", "Not Signif"), 
    fill = c("orange", "blue", "red", "black"))

# ------------------------------------------------------------------------------------------------

# Plot with genes highlighted on the study

two_models_pvalues$colour <- "grey"

plot(two_models_pvalues$limma_pvalue, two_models_pvalues$edgeR_pvalue, col = two_models_pvalues$colour, 
    xlab = "Limma model p-values", ylab = "EdgeR model p-values", main = "Limma vs EdgeR by sample type CCR labelled")
points(two_models_pvalues[which(grepl("CCR", two_models_pvalues$hgnc_symbol)), 2:3], 
    pch = 20, col = "red", cex = 1.5)
legend("topleft", inset = 0.01, legend = c("CCR", "rest"), fill = c("red", "grey"))

From the first scatter plot, we can see that the model produced by limma and edgeR shares more similarity than the complex model discussed above. The scatter plot labelled with CCR genes shows that more of the CCR gene are label significant under the edgeR package comparing to the limma model. Eventually, I prefer the edgeR model Since it identifies more genes under significant range while aligning better with the original research.

Model results

# Plotting using result from samples_type edgeR

# Clustering samples by sample type
tumor <- geo_data[, grepl("tumor", colnames(geo_data))]
NBP <- geo_data[, grepl("NBP", colnames(geo_data))]
geo_data_avg <- data.frame(hgnc_symbol = gsub("-", ".", rownames(geo_data)), Tumor = rowMeans(tumor), 
    NBP = rowMeans(NBP))

# Construct the edgeR result table
topfit_byS.qlf.result <- cbind(topfit_byS.qlf$table, hgnc_symbol = rownames(topfit_byS.qlf$table))

# Merge it with the fit result
geo_data_byS <- merge(geo_data_avg, topfit_byS.qlf.result, by.x = "hgnc_symbol", 
    by.y = "hgnc_symbol", all = TRUE)

# Count under/over express genes
under_exp <- sum(geo_data_byS$logFC < 0 & geo_data_byS$PValue < 0.01)
over_exp <- sum(geo_data_byS$logFC > 0 & geo_data_byS$PValue < 0.01)

# Prepare coloring on the plots
status <- rep(0, nrow(geo_data_byS))
status[geo_data_byS$logFC < 0 & geo_data_byS$PValue < 0.01] <- -1
status[geo_data_byS$logFC > 0 & geo_data_byS$PValue < 0.01] <- 1

limma::plotMD(log2(geo_data_byS[, c(2, 3)]), status = status, values = c(-1, 1), 
    hl.col = c("blue", "red"), main = "MA plots (signifi labelled with edgeR)")

# ------------------------------------------------------------------------------------------------

# Plotting using result from samples_type + patient model
summa.fit <- limma::decideTests(fit)
par(mfrow = c(2, 2))
limma::plotMD(fit, coef = 4, status = summa.fit[, "samples$patientpatient2"], values = c(-1, 
    1), hl.col = c("blue", "red"), main = "MA plots for Patient 1 vs 2")
limma::plotMD(fit, coef = 3, status = summa.fit[, "samples$patientpatient10"], values = c(-1, 
    1), hl.col = c("blue", "red"), main = "MA plots for Patient 1 vs 10")
limma::plotMD(fit, coef = 5, status = summa.fit[, "samples$patientpatient3"], values = c(-1, 
    1), hl.col = c("blue", "red"), main = "MA plots for Patient 1 vs 3")
limma::plotMD(fit, coef = 6, status = summa.fit[, "samples$patientpatient4"], values = c(-1, 
    1), hl.col = c("blue", "red"), main = "MA plots for Patient 1 vs 4")

3. Show the amount of differentially expressed genes using an MA Plot or a Volcano plot.

Highlight genes of interest.

As shown above, we plotted the MA plot using the result from the edgeR package. Downregulated genes are labelled in blue and unregulated genes are marked in red. There are approximately NA downregulated genes and NA upregulated genes. Additionally, I also plotted the MA plot from the complex model for patients 1, 3, 4 and 10. It is worth noting that there are quite a few of the genes down/upregulation among each patient.

# Subset the threshold gene
top_hits <- topfit_byS.qlf.result$hgnc_symbol[topfit_byS.qlf.result$PValue < 0.01]

# Create and order the heatmap matrix
heatmap_matrix_tophits <- heatmap_matrix[which(rownames(heatmap_matrix) %in% top_hits), 
    ]
heatmap_matrix_tophits <- heatmap_matrix_tophits[, c(grep("tumor", colnames(heatmap_matrix_tophits)), 
    grep("NBP", colnames(heatmap_matrix_tophits)))]

# Create heatmap attribute
heatmap_col = circlize::colorRamp2(c(min(heatmap_matrix_tophits), 0, max(heatmap_matrix_tophits)), 
    c("blue", "white", "red"))

ha_colours <- c("darkgreen", "darkblue")
names(ha_colours) <- c("tumor", "NBP")

ha <- ComplexHeatmap::HeatmapAnnotation(df = data.frame(type = rep(c("tumor", "NBP"), 
    c(22, 12))), col = list(type = ha_colours))

# Create heatmap
signif_heatmap <- ComplexHeatmap::Heatmap(as.matrix(heatmap_matrix_tophits), cluster_rows = TRUE, 
    cluster_columns = FALSE, show_row_dend = TRUE, show_column_dend = FALSE, col = heatmap_col, 
    show_column_names = TRUE, show_row_names = FALSE, show_heatmap_legend = TRUE, 
    top_annotation = ha)

signif_heatmap

Question 4

  1. Visualize your top hits using a heatmap. Do you conditions cluster together? Explain why or why not.

Yes, there is an apparent clustering between tumor and NBP samples. When looking at the samples labelled in dark blue from the left, there is clustering on the lower side of the heatmap, and in the meantime, a significant red clustering exists on the upper side of the tumor samples labelled in dark green.

Thresholded over-representation analysis

# Calculate up/down regulated genes
upregulated_genes <- topfit_byS.qlf.result$hgnc_symbol[which(topfit_byS.qlf.result$PValue < 
    0.01 & topfit_byS.qlf.result$logFC > 0)]
downregulated_genes <- topfit_byS.qlf.result$hgnc_symbol[which(topfit_byS.qlf.result$PValue < 
    0.01 & topfit_byS.qlf.result$logFC < 0)]

# Export gene tables
write.table(x = upregulated_genes, file = file.path("data", "mets_upregulated_genes.txt"), 
    sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE)
write.table(x = downregulated_genes, file = file.path("data", "mets_downregulated_genes.txt"), 
    sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE)

1. Which method did you choose and why?

The method I used is the Benjamini-Hochberg FDR method under g:profiler (Raudvere et al. 2019). G:profiler is a constantly updating annotation tools, and has connection with many human genome databases. I found it helpful in annotating cancer-related data.

2. What annotation data did you use and why? What version of the annotation are you using?

I choose to use the following four annotation data: GO: Biological Process - Release 2020-12-08 (Mi et al. 2018) KEGG – Release 2020-12-14 (Kanehisa and Goto 2000) Reactome – Release 2020-12-15 (Fabregat et al. 2017) WikiPathways – Release 2020-12-10 (Martens et al. 2020) Since they are both biological pathway databases with relatively recent updates, it will be helpful to look at results from all of them. Additionally, the original paper uses GO as its source of pathway analysis. (Plitas et al. 2016)

3. How many genesets were returned with what thresholds?

I choose to use thresholds of 5%, and it returns 223 genesets under the GO Biological process, six under KEGG, six under Reactome and two under WikiPathways. The detailed results are listed below: g:Profiler complete thresholded list result summary g:Profiler complete thresholded list detail result

4. Run the analysis using the up-regulated set of genes, and the down-regulated set of genes separately.

How do these results compare to using the whole list (i.e all differentially expressed genes together vs. the up-regulated and down regulated differentially expressed genes separately)?

g:Profiler downregulated list result summary g:Profiler downregulated list detail result g:Profiler upregulated list result summary g:Profiler upregulated list detail result (GO) g:Profiler upregulated list detail result (KEGG/REACT/WP)

From the result above, it is clear to see that the upregulated gene is more significantly marked by the g:profiler, while the downregulated genes are not. This finding is echoing with the MA plots for patients, where it shows many “downregulated genes” exist between each patient. These individual variations are primarily due to randomness. Hence it is less likely to be correlated to an exact pathway. The whole list shares similar genesets with the upregulated list, where “cytokine” related genesets are highlighted.

Interpretation

1. Do the over-representation results support conclusions or mechanism discussed in the original paper?

The original paper conducted GO pathway enrichment analysis, the exact mechanism and databases used in our study. The over-representation results align with the output from the original paper where “a number of cytokine and chemokine receptor genes, most notably CCR8, were upregulated in tumor-resident Treg cells in comparison to normal tissue resident ones.” (Plitas et al. 2016) which is precisely the case shown in our analysis. However, the CCR8 gene is highlighted in the original paper, but it is not part of our thresholded list. This could be because of differences in choosing significant levels (the original article decide to use a = 0.05) and differences in gene mappings. The mapping list is slightly different in 2021 compared to 2017, when the paper was initially published.

2. Can you find evidence, i.e. publications, to support some of the results that you see.

How does this evidence support your results.

One of the results I found is that the immune response and Type 1 IFNs pathway are upregulated in the tumor cells. This is seen in another study proposing the mechanism through which BRCA1 regulates IFN-γ–dependent signalling, which induces the upregulation. (Buckley et al. 2007) Hence this could affect the downstream pathway in the tumor-residenting Treg cell, therefore showing upregulation of type 1 IFNs pathway in our tumor samples.

Another result I found is the regulation of genome replication. A review article on DNA damage response (DDR) states that DDR activities the regulation of genome replication; it is upregulated in the tumor breast cancer cell. In the meantime, “DNA damage results in an increase in levels of inflammatory cytokines, which activate type I interferons which are known to augment cytotoxic T-cell priming.” (Minchom, Aversa, and Lopez 2018) The upregulation of genes associated with genome replication in Treg may be coming from the tumor cell in the form of contamination.

References

Buckley, Niamh E., Alison M. Hosey, Julia J. Gorski, James W. Purcell, Jude M. Mulligan, D. Paul Harkin, and Paul B. Mullan. 2007. “BRCA1 Regulates Ifn-γ Signaling Through a Mechanism Involving the Type I Ifns.” Molecular Cancer Research 5 (3): 261–70. https://doi.org/10.1158/1541-7786.MCR-06-0250.

Durinck, Steffen, Yves Moreau, Arek Kasprzyk, Sean Davis, Bart De Moor, Alvis Brazma, and Wolfgang Huber. 2005. “BioMart and Bioconductor: A Powerful Link Between Biological Databases and Microarray Data Analysis.” Bioinformatics 21: 3439–40.

Fabregat, Antonio, Steven Jupe, Lisa Matthews, Konstantinos Sidiropoulos, Marc Gillespie, Phani Garapati, Robin Haw, et al. 2017. “The Reactome Pathway Knowledgebase.” Nucleic Acids Research 46 (D1): D649–D655. https://doi.org/10.1093/nar/gkx1132.

Gu, Zuguang, Roland Eils, and Matthias Schlesner. 2016. “Complex Heatmaps Reveal Patterns and Correlations in Multidimensional Genomic Data.” Bioinformatics.

Gu, Zuguang, Lei Gu, Roland Eils, Matthias Schlesner, and Benedikt Brors. 2014. “Circlize Implements and Enhances Circular Visualization in R.” Bioinformatics 30 (19): 2811–2.

Kanehisa, Minoru, and Susumu Goto. 2000. “KEGG: Kyoto Encyclopedia of Genes and Genomes.” Nucleic Acids Research 28 (1): 27–30. https://doi.org/10.1093/nar/28.1.27.

Martens, Marvin, Ammar Ammar, Anders Riutta, Andra Waagmeester, Denise N Slenter, Kristina Hanspers, Ryan A. Miller, et al. 2020. “WikiPathways: connecting communities.” Nucleic Acids Research 49 (D1): D613–D621. https://doi.org/10.1093/nar/gkaa1024.

Mi, Huaiyu, Anushya Muruganujan, Dustin Ebert, Xiaosong Huang, and Paul D Thomas. 2018. “PANTHER version 14: more genomes, a new PANTHER GO-slim and improvements in enrichment analysis tools.” Nucleic Acids Research 47 (D1): D419–D426. https://doi.org/10.1093/nar/gky1038.

Minchom, Anna, Caterina Aversa, and Juanita Lopez. 2018. “Dancing with the Dna Damage Response: Next-Generation Anti-Cancer Therapeutic Strategies.” Therapeutic Advances in Medical Oncology 10: 1758835918786658. https://doi.org/10.1177/1758835918786658.

Morgan, Martin. 2019. BiocManager: Access the Bioconductor Project Package Repository. https://CRAN.R-project.org/package=BiocManager.

Plitas, George, Catherine Konopacki, Kenmin Wu, Paula D. Bos, Monica Morrow, Ekaterina V. Putintseva, Dmitriy M. Chudakov, and Alexander Y. Rudensky. 2016. “Regulatory T Cells Exhibit Distinct Features in Human Breast Cancer.” Immunity 45 (5): 1122–34. https://doi.org/https://doi.org/10.1016/j.immuni.2016.10.032.

Raudvere, Uku, Liis Kolberg, Ivan Kuzmin, Tambet Arak, Priit Adler, Hedi Peterson, and Jaak Vilo. 2019. “g:Profiler: a web server for functional enrichment analysis and conversions of gene lists (2019 update).” Nucleic Acids Research 47 (W1): W191–W198. https://doi.org/10.1093/nar/gkz369.

Ritchie, Matthew E, Belinda Phipson, Di Wu, Yifang Hu, Charity W Law, Wei Shi, and Gordon K Smyth. 2015. “limma Powers Differential Expression Analyses for RNA-Sequencing and Microarray Studies.” Nucleic Acids Research 43 (7): e47. https://doi.org/10.1093/nar/gkv007.

Robinson, Mark D, Davis J McCarthy, and Gordon K Smyth. 2010. “EdgeR: A Bioconductor Package for Differential Expression Analysis of Digital Gene Expression Data.” Bioinformatics 26 (1): 139–40. https://doi.org/10.1093/bioinformatics/btp616.

R Special Interest Group on Databases (R-SIG-DB), Hadley Wickham, and Kirill Müller. 2019. DBI: R Database Interface. https://CRAN.R-project.org/package=DBI.

Wickham, Hadley. 2016. Ggplot2: Elegant Graphics for Data Analysis. Springer-Verlag New York. https://ggplot2.tidyverse.org.

———. 2020. Tidyr: Tidy Messy Data. https://CRAN.R-project.org/package=tidyr.

Wickham, Hadley, Romain François, Lionel Henry, and Kirill Müller. 2020. Dplyr: A Grammar of Data Manipulation. https://CRAN.R-project.org/package=dplyr.

Wilke, Claus O. 2020. Cowplot: Streamlined Plot Theme and Plot Annotations for ’Ggplot2’. https://CRAN.R-project.org/package=cowplot.

Zhu, Yuelin, Sean Davis, Robert Stephens, Paul S. Meltzer, and Yidong Chen. 2008. “GEOmetadb: Powerful Alternative Search Engine for the Gene Expression Omnibus.” Bioinformatics (Oxford, England) 24 (23): 2798–2800. https://doi.org/10.1093/bioinformatics/btn520.